This notebook presents analyses and visualizations of singing voice activity detection results for Wagner's "Ring".
Author: Christof Weiß, AudioLabs Erlangen 2019
import glob
import os
import numpy as np
from scipy import signal
import pandas as pd
import sklearn.ensemble
import IPython.display as ipd
import librosa
from collections import OrderedDict
from matplotlib import pyplot as plt
from matplotlib import colors
import matplotlib.gridspec as gridspec
%matplotlib inline
Test piece: WWV 86 B 1 - "Die Walküre"
# --- Input locations ---------------------------------------------------------
# Audio recording and singing-voice annotation table of the test piece.
wagner_root = os.path.join('D:', os.sep, 'WagnerLyrics')
audio_file_name = os.path.join(wagner_root, 'wav_22050_stereo', 'Wagner_WWV086B-1_Karajan1966_DG1998.wav')
# Uncorrected alternative: 'Wagner_WWV086B-1_Karajan1966_LyricsAnnotated.csv'
annot_file_name = os.path.join(wagner_root, 'Annotations_SingingVoice_Audio', 'Wagner_WWV086B-1_Karajan1966_LyricsAnnotated_Corrected.csv')

# Detector output and reference targets stored as .npy files.
results_path = os.path.join('H:', os.sep, 'workspace', 'python', 'ISAD_wagner_vad', 'wagner_vad', 'results', 'split_b')
# Other result files that can be selected here instead:
#   'lr_pcen_results.npy', 'pcen_results.npy', 'schl_model_results.npy'
res_file = '0m_cnn_results.npy'
gt_file = 'vad_true_targets.npy'
res_file_name, gt_file_name = (os.path.join(results_path, file_name) for file_name in (res_file, gt_file))

# --- Load everything ---------------------------------------------------------
results = np.load(res_file_name)
ground_truth = np.load(gt_file_name)
annotationTable = pd.read_csv(annot_file_name, sep='/')
f_audio, Fs = librosa.load(audio_file_name, sr=None)  # sr=None keeps the file's own sampling rate

# --- Sanity check: all three sequences, expressed in minutes at rate Fs -------
samples_per_minute = Fs * 60
for label, series in (('Results', results), ('Annotations', ground_truth), ('Audio', f_audio)):
    print('Length ' + label + ': ' + str(series.shape[0] / samples_per_minute) + ' minutes.')

assert results.shape == ground_truth.shape, 'Results and ground truth have different shape!'

# Listen to the first 20 seconds of the recording.
audio_preview = f_audio[:Fs * 20]
ipd.display(ipd.Audio(data=audio_preview, rate=Fs))
Sample-based evaluation measures, for verification of the numbers.
# Sample-based evaluation of the detector output against the reference targets.
# The dot products below count coinciding ones, so both vectors are presumably
# binary 0/1 sequences -- TODO confirm against the stored .npy files.
num_samples = ground_truth.shape[0]
ground_truth_inverted = 1 - ground_truth

# Frame grid used by the frame-wise cells further down.
frame_rate = 10  # Hz
hop_length = Fs // frame_rate
length_in_frames = librosa.samples_to_frames(num_samples, hop_length=hop_length)

num_hits = np.dot(results, ground_truth)  # samples where prediction and reference are both active
precision = num_hits / np.sum(results)
recall = num_hits / np.sum(ground_truth)
f_meas = 2 * precision * recall / (precision + recall)

# Accuracy counts agreement with the reference, error rate agreement with its inverse.
accuracy = np.sum(results == ground_truth) / num_samples
error_rate = np.sum(results == ground_truth_inverted) / num_samples

print('*** Sample-based results: ***')
for template, value in (
    ('Precision: %.3f ', precision),
    ('Recall: %.3f ', recall),
    ('F-Measure: %.3f', f_meas),
    ('Accuracy: %.3f %%', 100 * accuracy),
    ('Error rate: %.3f %%', 100 * error_rate),
):
    print(template % (value))
The same verification of the numbers, computed in a frame-wise manner.
# Reduce both label sequences to the frame grid: cut them into non-overlapping
# blocks of hop_length samples, average each block and round to the nearest label.
# np.around rounds exact halves to the even value, so a block mean of 0.5 becomes 0.
results_blocks = librosa.util.frame(results, frame_length=hop_length, hop_length=hop_length)
ground_truth_blocks = librosa.util.frame(ground_truth, frame_length=hop_length, hop_length=hop_length)
results_fw = np.around(np.mean(results_blocks, axis=0))
ground_truth_fw = np.around(np.mean(ground_truth_blocks, axis=0))
ground_truth_inverted_fw = 1 - ground_truth_fw
num_frames = results_fw.shape[0]

# Same measures as in the sample-based evaluation, now per frame.
num_hits_fw = np.dot(results_fw, ground_truth_fw)
precision_fw = num_hits_fw / np.sum(results_fw)
recall_fw = num_hits_fw / np.sum(ground_truth_fw)
f_meas_fw = 2 * precision_fw * recall_fw / (precision_fw + recall_fw)
accuracy_fw = np.sum(results_fw == ground_truth_fw) / num_frames
error_rate_fw = np.sum(results_fw == ground_truth_inverted_fw) / num_frames

print('*** Frame-based results: ***')
for template, value in (
    ('Precision: %.3f ', precision_fw),
    ('Recall: %.3f ', recall_fw),
    ('F-Measure: %.3f', f_meas_fw),
    ('Accuracy: %.3f %%', 100 * accuracy_fw),
    ('Error rate: %.3f %%', 100 * error_rate_fw),
):
    print(template % (value))
# Sample-wise confusion categories (one entry per sample).
true_pos = np.multiply((results == ground_truth), ground_truth)
false_neg = np.multiply((results == 0), (ground_truth == 1))
false_pos = np.multiply((results == 1), (ground_truth == 0))
print('false positives: %.1f %%' % (100*np.sum(false_pos)/num_samples))
print('false negatives: %.1f %%' % (100*np.sum(false_neg)/num_samples))

# imshow needs 2-D input, so every label sequence is turned into a one-row image.
all_errors = np.reshape(results != ground_truth, [1, num_samples])
ground_truth_labels = np.reshape(ground_truth, [1, num_samples])

# Four categorical colours, indexed by the codes 0..3 of `visualization` below
# (0 = nothing, 1 = FP, 2 = FN, 3 = TP, see the colorbar tick labels).
# discrete_cmap = colors.ListedColormap([[1, 1, 1], [1, 0.3, 0.3], [1, 0.7, 0.7], [0, 0, 0]]) # "FMP-map"
discrete_cmap = colors.ListedColormap([[1, 1, 1], [0.5, 1, 0.5], [1, 0.5, 0.5], [0, 0, 0]])
# plt.get_cmap replaces plt.cm.get_cmap, which recent Matplotlib releases no longer
# provide; plt.get_cmap returns the same colormaps on old and new versions.
blues = plt.get_cmap('Blues')
reds = plt.get_cmap('Reds')

# Time axis of the plots in minutes (sample index / sampling rate / 60).
t_min_minutes = 0
t_max_minutes = num_samples/(Fs*60)

# Three stacked panels, each with a narrow colorbar axis to its right.
plt.figure(figsize=(19, 8), dpi=50)
gs = gridspec.GridSpec(3, 2, width_ratios=[1, 0.02])
ax0 = plt.subplot(gs[0, 0])
ax1 = plt.subplot(gs[0, 1])
ax2 = plt.subplot(gs[1, 0])
ax3 = plt.subplot(gs[1, 1])
ax4 = plt.subplot(gs[2, 0])
ax5 = plt.subplot(gs[2, 1])

# Boundaries centred on the integer codes. The Reds/Blues variants are defined
# here and reused by the plotting cells further down.
bounds = np.array([0, 1, 2, 3, 4])-0.5
norm = colors.BoundaryNorm(bounds, discrete_cmap.N)
boundsReds = np.array([0, 1, 2])-0.5
normReds = colors.BoundaryNorm(boundsReds, reds.N)
boundsBlues = np.array([0, 1, 2])-0.5
normBlues = colors.BoundaryNorm(boundsBlues, blues.N)

# Panel 1: confusion category per sample (TP -> 3, FN -> 2, FP -> 1, otherwise 0).
visualization = np.reshape(3 * true_pos + 2 * false_neg + 1 * false_pos, [1, num_samples])
im = ax0.imshow(visualization, origin='lower', aspect='auto', cmap=discrete_cmap, extent=[t_min_minutes, t_max_minutes, 0, 1])
cbar = plt.colorbar(im, cax=ax1, cmap=discrete_cmap, norm=norm, boundaries=bounds, ticks=[0, 1, 2, 3])
cbar.ax.set_yticklabels(['', 'FP', 'FN', 'TP'])
ax0.set_title('Detailed results')

# Panel 2: every sample where prediction and reference disagree.
im = ax2.imshow(all_errors, origin='lower', aspect='auto', cmap=reds, extent=[t_min_minutes, t_max_minutes, 0, 1])
cbar = plt.colorbar(im, cax=ax3, cmap=reds, norm=normReds, boundaries=boundsReds, ticks=[0, 1])
cbar.ax.set_yticklabels(['correct', 'wrong'])
ax2.set_xlabel('Time (minutes)')
ax2.set_title('All errors')

# Panel 3: the reference labels themselves.
im = ax4.imshow(ground_truth_labels, origin='lower', aspect='auto', cmap=blues, extent=[t_min_minutes, t_max_minutes, 0, 1])
cbar = plt.colorbar(im, cax=ax5, cmap=blues, norm=normBlues, boundaries=boundsBlues, ticks=[0, 1])
cbar.ax.set_yticklabels(['no singing', 'singing'])
ax4.set_title('Ground truth annotations')
ax4.set_xlabel('Time (minutes)')

plt.tight_layout()
plt.show()
# Frame-wise confusion categories (one entry per frame).
true_pos_fw = np.multiply((results_fw == ground_truth_fw), ground_truth_fw)
false_neg_fw = np.multiply((results_fw == 0), (ground_truth_fw == 1))
false_pos_fw = np.multiply((results_fw == 1), (ground_truth_fw == 0))
print('false positives: %.1f %%' % (100*np.sum(false_pos_fw)/num_frames))
print('false negatives: %.1f %%' % (100*np.sum(false_neg_fw)/num_frames))

# imshow needs 2-D input, so every label sequence is turned into a one-row image.
all_errors_fw = np.reshape(results_fw != ground_truth_fw, [1, num_frames])
ground_truth_labels_fw = np.reshape(ground_truth_fw, [1, num_frames])

# Four categorical colours, indexed by the codes 0..3 of `visualization_fw` below.
# discrete_cmap = colors.ListedColormap([[1, 1, 1], [1, 0.3, 0.3], [1, 0.7, 0.7], [0, 0, 0]]) # "FMP-map"
discrete_cmap = colors.ListedColormap([[1, 1, 1], [0.5, 1, 0.5], [1, 0.5, 0.5], [0, 0, 0]])
# plt.get_cmap replaces plt.cm.get_cmap, which recent Matplotlib releases no longer
# provide; plt.get_cmap returns the same colormaps on old and new versions.
blues = plt.get_cmap('Blues')
reds = plt.get_cmap('Reds')

# Time axis in minutes, now derived from the frame count and the frame rate.
t_min_minutes = 0
t_max_minutes = num_frames/(frame_rate*60)

# Three stacked panels, each with a narrow colorbar axis to its right.
plt.figure(figsize=(19, 8), dpi=50)
gs = gridspec.GridSpec(3, 2, width_ratios=[1, 0.02])
ax0 = plt.subplot(gs[0, 0])
ax1 = plt.subplot(gs[0, 1])
ax2 = plt.subplot(gs[1, 0])
ax3 = plt.subplot(gs[1, 1])
ax4 = plt.subplot(gs[2, 0])
ax5 = plt.subplot(gs[2, 1])

# normReds/boundsReds and normBlues/boundsBlues are taken from the sample-wise
# plotting cell; only the four-category norm is rebuilt here.
bounds = np.array([0, 1, 2, 3, 4])-0.5
norm = colors.BoundaryNorm(bounds, discrete_cmap.N)

# Panel 1: confusion category per frame (TP -> 3, FN -> 2, FP -> 1, otherwise 0).
visualization_fw = np.reshape(3 * true_pos_fw + 2 * false_neg_fw + 1 * false_pos_fw, [1, num_frames])
im = ax0.imshow(visualization_fw, origin='lower', aspect='auto', cmap=discrete_cmap, extent=[t_min_minutes, t_max_minutes, 0, 1])
cbar = plt.colorbar(im, cax=ax1, cmap=discrete_cmap, norm=norm, boundaries=bounds, ticks=[0, 1, 2, 3])
cbar.ax.set_yticklabels(['', 'FP', 'FN', 'TP'])
ax0.set_title('Detailed results')

# Panel 2: every frame where prediction and reference disagree.
im = ax2.imshow(all_errors_fw, origin='lower', aspect='auto', cmap=reds, extent=[t_min_minutes, t_max_minutes, 0, 1])
cbar = plt.colorbar(im, cax=ax3, cmap=reds, norm=normReds, boundaries=boundsReds, ticks=[0, 1])
cbar.ax.set_yticklabels(['correct', 'wrong'])
ax2.set_title('All errors')

# Panel 3: the frame-wise reference labels.
im = ax4.imshow(ground_truth_labels_fw, origin='lower', aspect='auto', cmap=blues, extent=[t_min_minutes, t_max_minutes, 0, 1])
cbar = plt.colorbar(im, cax=ax5, cmap=blues, norm=normBlues, boundaries=boundsBlues, ticks=[0, 1])
cbar.ax.set_yticklabels(['no singing', 'singing'])
ax4.set_title('Ground truth annotations')
ax4.set_xlabel('Time (minutes)')

plt.tight_layout()
plt.show()
Ground truth from the annotation table, which also contains person and lyrics annotations.
# Rasterise the annotation table onto the frame grid: one overall singing row
# plus one row per singer.
# NOTE: despite its name this holds the first *data* row of the table, not the
# header. It is not used in this cell and is kept only in case other cells read it.
column_names = annotationTable.iloc[0]

ground_truth_from_table = np.zeros((1, length_in_frames))
activity_sieglinde = np.zeros((1, length_in_frames))
activity_siegmund = np.zeros((1, length_in_frames))
activity_hunding = np.zeros((1, length_in_frames))

# Each singer row is filled with its own code (1, 2, 3) so the stacked image
# further down can give every singer a distinct colour.
singer_tracks = {
    'Sieglinde': (activity_sieglinde, 1),
    'Siegmund': (activity_siegmund, 2),
    'Hunding': (activity_hunding, 3),
}

for line_num in range(annotationTable.shape[0]):
    time_start_sec = annotationTable.at[line_num, 'Time_Start']
    time_start_frames = librosa.time_to_frames(time_start_sec, sr=Fs, hop_length=hop_length)
    time_end_sec = annotationTable.at[line_num, 'Time_End']
    # +1 because the slice end is exclusive: the frame containing Time_End is included.
    time_end_frames = librosa.time_to_frames(time_end_sec, sr=Fs, hop_length=hop_length) + 1

    ground_truth_from_table[:, time_start_frames:time_end_frames] = 1

    # Match the singer by exact (whitespace-stripped) name. The former test
    # `person in 'Sieglinde'` checked whether the cell was a *substring* of the
    # name, so partial or empty cells matched and non-string cells (NaN) raised.
    person = str(annotationTable.at[line_num, 'Person']).strip()
    if person in singer_tracks:
        track, singer_code = singer_tracks[person]
        track[:, time_start_frames:time_end_frames] = singer_code

# Row order (bottom to top in the plots): Hunding, Siegmund, Sieglinde.
activity_by_singer = np.concatenate((activity_hunding, activity_siegmund, activity_sieglinde), axis=0)
# Colour tables: one colour per singer code (0 = white background), plus a
# two-colour map left over from the disabled per-singer layout.
discrete_cmap2 = colors.ListedColormap([[1, 1, 1], [1, 0.4, 0.4], [0.1, 0.8, 0.6], [0.1, 0.6, 0.8]])
blues_discrete = colors.ListedColormap([[1, 1, 1], [0.2, 0.2, 1]])

# Four stacked panels, each paired with a narrow colorbar axis on its right.
plt.figure(figsize=(19, 10), dpi=50)
gs = gridspec.GridSpec(4, 2, width_ratios=[1, 0.02])
(ax0, ax1), (ax2, ax3), (ax4, ax5), (ax6, ax7) = [
    (plt.subplot(gs[row, 0]), plt.subplot(gs[row, 1])) for row in range(4)
]

# (A disabled alternative drew Sieglinde, Siegmund and Hunding on separate axes
#  with `blues_discrete`; the combined multi-singer panel below is used instead.)

# show one multi-singer plot
bounds = np.array([0, 1, 2, 3, 4]) - 0.5
norm = colors.BoundaryNorm(bounds, discrete_cmap.N)

one_row_extent = [t_min_minutes, t_max_minutes, 0, 1]
three_row_extent = [t_min_minutes, t_max_minutes, 0, 3]

# Panels 1 and 2: binary singing activity from the two ground-truth sources.
for panel_ax, bar_ax, label_image, panel_title in (
    (ax0, ax1, ground_truth_labels_fw, 'Ground truth frame-wise'),
    (ax2, ax3, ground_truth_from_table, 'Ground truth from table'),
):
    im = panel_ax.imshow(label_image, origin='lower', aspect='auto', cmap=blues, extent=one_row_extent)
    cbar = plt.colorbar(im, cax=bar_ax, cmap=blues, norm=normBlues, boundaries=boundsBlues, ticks=[0, 1])
    cbar.ax.set_yticklabels(['no singing', 'singing'])
    panel_ax.set_title(panel_title)

# Panel 3: one image row per singer, tick marks centred on each row.
im = ax4.imshow(activity_by_singer, origin='lower', aspect='auto', cmap=discrete_cmap2, extent=three_row_extent)
ax4.set_yticks(np.array([0, 1, 2]) + 0.5)
ax4.set_yticklabels(['Hunding', 'Siegmund', 'Sieglinde'])
cbar = plt.colorbar(im, cax=ax5, cmap=discrete_cmap, ticks=[0, 1, 2, 3], norm=norm, boundaries=bounds)
cbar.ax.set_yticklabels(['no_singing', 'Sieglinde', 'Siegmund', 'Hunding'])
cbar.ax.invert_yaxis()
ax4.set_title('Ground truth activity per singer')

# Panel 4: frame-wise confusion categories from the frame-wise evaluation.
im = ax6.imshow(visualization_fw, origin='lower', aspect='auto', cmap=discrete_cmap, extent=one_row_extent)
cbar = plt.colorbar(im, cax=ax7, cmap=discrete_cmap, norm=norm, boundaries=bounds, ticks=[0, 1, 2, 3])
cbar.ax.set_yticklabels(['', 'FP', 'FN', 'TP'])
ax6.set_title('Detailed frame-wise results')
ax6.set_xlabel('Time (minutes)')

plt.tight_layout()
plt.show()
# Re-compute the frame-based evaluation measures, this time against the ground
# truth rasterised from the annotation table.
# The table-based labels are flattened to a vector of num_frames entries first.
ground_truth_from_table = ground_truth_from_table.reshape(num_frames)
ground_truth_inverted_table = 1 - ground_truth_from_table

num_hits_table = np.dot(results_fw, ground_truth_from_table)
precision_fw = num_hits_table / np.sum(results_fw)
recall_fw = num_hits_table / np.sum(ground_truth_from_table)
f_meas_fw = 2 * precision_fw * recall_fw / (precision_fw + recall_fw)
accuracy_fw = np.sum(results_fw == ground_truth_from_table) / num_frames
error_rate_fw = np.sum(results_fw == ground_truth_inverted_table) / num_frames

print('*** Frame-based results with annotations from table: ***')
for template, value in (
    ('Precision: %.3f ', precision_fw),
    ('Recall: %.3f ', recall_fw),
    ('F-Measure: %.3f', f_meas_fw),
    ('Accuracy: %.3f %%', 100 * accuracy_fw),
    ('Error rate: %.3f %%', 100 * error_rate_fw),
):
    print(template % (value))
# From now on, use only the ground truth from the table!
true_pos_fw = (results_fw == ground_truth_from_table) * ground_truth_from_table
false_neg_fw = (results_fw == 0) * (ground_truth_from_table == 1)
false_pos_fw = (results_fw == 1) * (ground_truth_from_table == 0)

# Frames in which each singer is annotated as active.
sieglinde_active = activity_sieglinde > 0
siegmund_active = activity_siegmund > 0
hunding_active = activity_hunding > 0

# Hits and misses are attributed to whoever sings in the frame. A false
# positive cannot be assigned to a singer, so every singer gets all of them.
true_pos_sieglinde = true_pos_fw * sieglinde_active
false_neg_sieglinde = false_neg_fw * sieglinde_active
false_pos_sieglinde = false_pos_fw

true_pos_siegmund = true_pos_fw * siegmund_active
false_neg_siegmund = false_neg_fw * siegmund_active
false_pos_siegmund = false_pos_fw

true_pos_hunding = true_pos_fw * hunding_active
false_neg_hunding = false_neg_fw * hunding_active
false_pos_hunding = false_pos_fw

# Encode the categories as 3 (TP), 2 (FN), 1 (FP), 0 (otherwise), one image row per singer.
visualization_sieglinde = (3 * true_pos_sieglinde + 2 * false_neg_sieglinde + 1 * false_pos_sieglinde).reshape(1, num_frames)
visualization_siegmund = (3 * true_pos_siegmund + 2 * false_neg_siegmund + 1 * false_pos_siegmund).reshape(1, num_frames)
visualization_hunding = (3 * true_pos_hunding + 2 * false_neg_hunding + 1 * false_pos_hunding).reshape(1, num_frames)
visualization_per_singer = np.vstack((visualization_hunding, visualization_siegmund, visualization_sieglinde))

# Three stacked panels with their colorbar axes.
plt.figure(figsize=(19, 8), dpi=50)
gs = gridspec.GridSpec(3, 2, width_ratios=[1, 0.02])
(ax0, ax1), (ax2, ax3), (ax4, ax5) = [
    (plt.subplot(gs[row, 0]), plt.subplot(gs[row, 1])) for row in range(3)
]

singer_rows = ['Hunding', 'Siegmund', 'Sieglinde']
row_centres = np.array([0, 1, 2]) + 0.5
three_row_extent = [t_min_minutes, t_max_minutes, 0, 3]

# Panel 1: annotated activity per singer.
im = ax0.imshow(activity_by_singer, origin='lower', aspect='auto', cmap=discrete_cmap2, extent=three_row_extent)
ax0.set_yticks(row_centres)
ax0.set_yticklabels(singer_rows)
cbar = plt.colorbar(im, cax=ax1, cmap=discrete_cmap, ticks=[0, 1, 2, 3], norm=norm, boundaries=bounds)
cbar.ax.set_yticklabels(['no_singing', 'Sieglinde', 'Siegmund', 'Hunding'])
cbar.ax.invert_yaxis()
ax0.set_title('Ground truth activity per singer')

# Panel 2: confusion categories per singer.
im = ax2.imshow(visualization_per_singer, origin='lower', aspect='auto', cmap=discrete_cmap, extent=three_row_extent)
ax2.set_yticks(row_centres)
ax2.set_yticklabels(singer_rows)
cbar = plt.colorbar(im, cax=ax3, cmap=discrete_cmap, norm=norm, boundaries=bounds, ticks=[0, 1, 2, 3])
cbar.ax.set_yticklabels(['', 'FP', 'FN', 'TP'])
ax2.set_title('Frame-wise results per singer')

# Panel 3: overall frame-wise categories.
# NOTE(review): `visualization_fw` is not recomputed in this cell, so it still
# reflects whichever ground truth the earlier cell used -- confirm that is intended.
im = ax4.imshow(visualization_fw, origin='lower', aspect='auto', cmap=discrete_cmap, extent=[t_min_minutes, t_max_minutes, 0, 1])
cbar = plt.colorbar(im, cax=ax5, cmap=discrete_cmap, norm=norm, boundaries=bounds, ticks=[0, 1, 2, 3])
cbar.ax.set_yticklabels(['', 'FP', 'FN', 'TP'])
ax4.set_title('Frame-wise results')
ax4.set_xlabel('Time (minutes)')

plt.tight_layout()
plt.show()
# Per-singer recall: detected singing frames / all annotated singing frames of that singer.
recall_sieglinde = np.sum(true_pos_sieglinde)/(np.sum(true_pos_sieglinde)+np.sum(false_neg_sieglinde))
recall_siegmund = np.sum(true_pos_siegmund)/(np.sum(true_pos_siegmund)+np.sum(false_neg_siegmund))
recall_hunding = np.sum(true_pos_hunding)/(np.sum(true_pos_hunding)+np.sum(false_neg_hunding))

# all_errors = (results_fw!=ground_truth_fw)
# NOTE: this rebinds `all_errors`, which an earlier cell used for the sample-wise error image.
all_errors = false_neg_fw + false_pos_fw

# Error rates relative to the total number of frames.
falseneg_rate_sieglinde = np.sum(false_neg_sieglinde)/num_frames
falseneg_rate_siegmund = np.sum(false_neg_siegmund)/num_frames
falseneg_rate_hunding = np.sum(false_neg_hunding)/num_frames
falseneg_rate_allsingers = np.sum(false_neg_fw)/num_frames
falsepos_rate = np.sum(false_pos_fw)/num_frames
error_rate_allsingers = np.sum(all_errors)/num_frames

# Table columns; every array is ordered [all frames, Sieglinde, Siegmund, Hunding].
annotated_frames = np.array([num_frames, np.sum(activity_sieglinde>0), np.sum(activity_siegmund>0), np.sum(activity_hunding>0)])
annotated_frames_relative = 100*annotated_frames/num_frames
falseneg_rates = 100*np.array([falseneg_rate_allsingers, falseneg_rate_sieglinde, falseneg_rate_siegmund, falseneg_rate_hunding])
# False positives, precision and F-measure exist only for all frames; the
# per-singer entries are filled with 0 as placeholders.
falsepos_rates = 100*np.array([falsepos_rate, 0, 0, 0])
recalls = 100*np.array([recall_fw, recall_sieglinde, recall_siegmund, recall_hunding])
precisions = 100*np.array([precision_fw, 0, 0, 0])
fmeas_all = 100*np.array([f_meas_fw, 0, 0, 0])

# Do not truncate cell contents. Current pandas expects None for "unlimited"
# and rejects -1; older releases only accept the integer, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except (ValueError, TypeError):
    pd.set_option('display.max_colwidth', -1)

df = pd.DataFrame(OrderedDict([('# frames (annotated)', annotated_frames),
                               ('% frames (relative)', annotated_frames_relative),
                               ('False Negative Rate (%)', falseneg_rates),
                               ('False Positive Rate (%)', falsepos_rates),
                               ('Recall (%)', recalls),
                               ('Precision (%)', precisions),
                               ('F-Measure (%)', fmeas_all)
                               ]))
df.index = ['All frames', 'Sieglinde', 'Siegmund', 'Hunding']
ipd.HTML(df.to_html(escape=False, float_format='%.2f'))
from scipy.signal import medfilt as medfilt

# Sweep over odd median-filter lengths and record how smoothing the frame-wise
# detector output changes the scores against the table-based ground truth.
filter_lengths = np.arange(1, 99, 2)
precision_sweep, recall_sweep, fmeas_sweep, accuracy_sweep = np.zeros((4,) + filter_lengths.shape)

for idx, kernel in enumerate(filter_lengths):
    results_filtered = medfilt(results_fw, kernel_size=kernel)
    num_hits_filtered = np.dot(results_filtered, ground_truth_from_table)
    precision_sweep[idx] = num_hits_filtered / np.sum(results_filtered)
    recall_sweep[idx] = num_hits_filtered / np.sum(ground_truth_from_table)
    fmeas_sweep[idx] = 2 * precision_sweep[idx] * recall_sweep[idx] / (precision_sweep[idx] + recall_sweep[idx])
    accuracy_sweep[idx] = np.sum(results_filtered == ground_truth_from_table) / num_frames

# One curve per measure, plotted against the filter length.
fig = plt.figure(figsize=(12, 5), dpi=80)
ax = fig.add_subplot(1, 1, 1)
for curve, line_style in (
    (precision_sweep, dict(color='blue', linestyle='dashed')),
    (recall_sweep, dict(color='green', linestyle='dotted')),
    (fmeas_sweep, dict(color='red')),
    (accuracy_sweep, dict(color='black')),
):
    ax.plot(filter_lengths, curve, marker=None, linewidth=2, **line_style)
ax.legend(('Precision', 'Recall', 'F-Measure', 'Accuracy'))
ax.set_xlabel('Median filter length')
plt.tight_layout()
plt.show()
import itertools
import operator


def _nonzero_runs(flags):
    """Return the runs of consecutive non-zero entries in `flags`.

    Each run is a list of the consecutive indices it covers, in ascending order.
    """
    return [[index for index, _ in run]
            for key, run in itertools.groupby(enumerate(flags), key=operator.itemgetter(1))
            if key != 0]


def _play_error_blocks(blocks, min_length):
    """Print duration/start time and show an audio player for every long block.

    blocks: list of index lists as returned by `_nonzero_runs`.
    min_length: minimal number of frames a block needs in order to be played.
    Reads `f_audio`, `Fs` and `hop_length` from the notebook namespace.
    """
    for frames in blocks:
        if len(frames) < min_length:
            continue
        start_sample = librosa.frames_to_samples(frames[0], hop_length=hop_length)
        # frames[-1] is the last wrong frame itself, so the excerpt has to end
        # at the start of the following frame to include it.
        end_sample = librosa.frames_to_samples(frames[-1] + 1, hop_length=hop_length)
        print('duration: '+str((end_sample-start_sample)/Fs)+' seconds, start time '+str(start_sample/(Fs*60))+' minutes')
        ipd.display(ipd.Audio(data=f_audio[start_sample:end_sample], rate=Fs))


# we use the median-filtered results
kernel_size = 25
results_filtered = medfilt(results_fw, kernel_size=kernel_size)

true_pos_filtered = np.multiply((results_filtered==ground_truth_from_table), ground_truth_from_table)
false_neg_filtered = np.multiply((results_filtered==0), (ground_truth_from_table==1))
false_pos_filtered = np.multiply((results_filtered==1), (ground_truth_from_table==0))
# Report the rates of the *filtered* result (the unfiltered arrays were printed here before).
print('false positives: %.1f %%' % (100*np.sum(false_pos_filtered)/num_frames))
print('false negatives: %.1f %%' % (100*np.sum(false_neg_filtered)/num_frames))

all_errors_filtered = np.reshape(results_filtered!=ground_truth_from_table, [1, num_frames])
# NOTE: from here on `ground_truth_labels_fw` holds the table-based labels.
ground_truth_labels_fw = np.reshape(ground_truth_from_table, [1, num_frames])

# Contiguous stretches of wrong frames.
false_pos_blocks = _nonzero_runs(false_pos_filtered)
false_neg_blocks = _nonzero_runs(false_neg_filtered)

min_block_length = 35 # minimal number of wrong frames

print('')
print('***************************************************************************************')
print('* EXAMPLES - FALSE POSITIVES **********************************************************')
print('***************************************************************************************')
print('')
_play_error_blocks(false_pos_blocks, min_block_length)

print('')
print('***************************************************************************************')
print('* EXAMPLES - FALSE NEGATIVES **********************************************************')
print('***************************************************************************************')
print('')
_play_error_blocks(false_neg_blocks, min_block_length)
import scipy.io as sio

# Instrument-family activity matrix exported from MATLAB.
instrument_file_path = os.path.join('H:', os.sep, 'workspace','matlab','Wagner_Sync_Output','instfamiliesKarajan.mat')
mat_content = sio.loadmat(instrument_file_path)
# The last column is dropped; presumably this aligns the matrix with num_frames -- TODO confirm.
instrument_fam_activities = mat_content['familyActivities'][:,0:-1]
# Pick rows 5, 1 and 0 (labelled Strings, Brass, Woodwinds below) and scale them
# by 1, 2, 3 so that each family gets its own colour in discrete_cmap2.
three_fam_activities = np.multiply(np.reshape(np.array(([1, 2, 3])),[3, 1]), instrument_fam_activities[np.array(([5, 1, 0])),:])

# Three stacked panels, each with a narrow colorbar axis to its right.
plt.figure(figsize=(19, 10), dpi=50)
gs = gridspec.GridSpec(3, 2, width_ratios=[1, 0.02])
ax0 = plt.subplot(gs[0, 0])
ax1 = plt.subplot(gs[0, 1])
ax2 = plt.subplot(gs[1, 0])
ax3 = plt.subplot(gs[1, 1])
ax4 = plt.subplot(gs[2, 0])
ax5 = plt.subplot(gs[2, 1])

# Panel 1: annotated activity per singer.
im = ax0.imshow(activity_by_singer, origin='lower', aspect='auto', cmap=discrete_cmap2, extent=[t_min_minutes, t_max_minutes, 0, 3])
ax0.set_yticks(np.array([0, 1, 2])+0.5)
ax0.set_yticklabels(['Hunding', 'Siegmund', 'Sieglinde'])
cbar = plt.colorbar(im, cax=ax1, cmap=discrete_cmap, ticks=[0, 1, 2, 3], norm=norm, boundaries=bounds)
cbar.ax.set_yticklabels(['no_singing', 'Sieglinde', 'Siegmund', 'Hunding'])
cbar.ax.invert_yaxis()
ax0.set_title('Ground truth activity per singer')

# Panel 2: confusion categories per singer.
im = ax2.imshow(visualization_per_singer, origin='lower', aspect='auto', cmap=discrete_cmap, extent=[t_min_minutes, t_max_minutes, 0, 3])
ax2.set_yticks(np.array([0, 1, 2])+0.5)
ax2.set_yticklabels(['Hunding', 'Siegmund', 'Sieglinde'])
cbar = plt.colorbar(im, cax=ax3, cmap=discrete_cmap, norm=norm, boundaries=bounds, ticks=[0, 1, 2, 3])
cbar.ax.set_yticklabels(['', 'FP', 'FN', 'TP'])
ax2.set_title('Frame-wise results per singer')

# Panel 3: activity of the three instrument families.
im = ax4.imshow(three_fam_activities, origin='lower', aspect='auto', cmap=discrete_cmap2, extent=[t_min_minutes, t_max_minutes, 0, 3])
ax4.set_yticks(np.array([0, 1, 2])+0.5)
ax4.set_yticklabels(['Strings', 'Brass', 'Woodwinds'])
cbar = plt.colorbar(im, cax=ax5, cmap=discrete_cmap, ticks=[0, 1, 2, 3], norm=norm, boundaries=bounds)
cbar.ax.set_yticklabels(['silent', 'Strings', 'Brass', 'Woodwinds'])
#cbar.ax.invert_yaxis()
# Title the instrument panel itself; the top panel's title was set a second
# time here before, which left this panel without one.
ax4.set_title('Activity per instrument family')
ax4.set_xlabel('Time (minutes)')

plt.tight_layout()
plt.show()
# Per-family activity rows, with the last column of the matrix dropped.
# Rows 0, 1, 5 and 4 are read as woodwinds, brass, strings and singing.
family_rows = mat_content['familyActivities'][:, 0:-1]
woodwind_activities = family_rows[0]
brass_activities = family_rows[1]
strings_activities = family_rows[5]
singing_activities = family_rows[4]

# Total active time per family: active frames converted to minutes.
frames_per_minute = 60 * frame_rate
woodwind_minutes, brass_minutes, strings_minutes, singing_minutes = (
    np.sum(track) / frames_per_minute
    for track in (woodwind_activities, brass_activities, strings_activities, singing_activities)
)

bar_positions = np.arange(4)
plt.figure(figsize=(10, 3), dpi=50)
plt.bar(bar_positions, height=[woodwind_minutes, brass_minutes, strings_minutes, singing_minutes])
plt.xticks(bar_positions, ['Woodwinds', 'Brass', 'Strings', 'Singing'])
# The larger font size is set here and stays in rcParams for the following cells.
plt.rcParams.update({'font.size': 22})
plt.ylabel('Activity (minutes)')
plt.title('Total activites')
#plt.tight_layout()
plt.show()
%matplotlib inline
# Compare the singing row of the instrument annotations with the table-based
# ground truth: print both shapes and the fraction of frames on which they agree.
for label_vector in (ground_truth_from_table, singing_activities):
    print(label_vector.shape)
validate_singing_annot = np.sum(singing_activities == ground_truth_from_table) / num_frames
print(validate_singing_annot)

# (A disabled experiment delayed `singing_activities` by 0..39 frames and
#  printed the agreement score for every offset.)

# Both label sequences as one-row images above each other.
plt.figure(figsize=(25, 5), dpi=50)
gs = gridspec.GridSpec(2, 2, width_ratios=[1, 0.02])
(ax0, ax1), (ax2, ax3) = [
    (plt.subplot(gs[row, 0]), plt.subplot(gs[row, 1])) for row in range(2)
]

for panel_ax, bar_ax, label_vector, panel_title in (
    (ax0, ax1, singing_activities, 'Ground truth from instrument annotations'),
    (ax2, ax3, ground_truth_from_table, 'Ground truth from table'),
):
    label_image = np.reshape(label_vector, [1, num_frames])
    im = panel_ax.imshow(label_image, origin='lower', aspect='auto', cmap=blues, extent=[t_min_minutes, t_max_minutes, 0, 1])
    cbar = plt.colorbar(im, cax=bar_ax, cmap=blues, norm=normBlues, boundaries=boundsBlues, ticks=[0, 1])
    cbar.ax.set_yticklabels(['no singing', 'singing'])
    panel_ax.set_title(panel_title)
ax2.set_xlabel('Time (minutes)')
# plt.tight_layout()
plt.show()
# Export singing voice regions from instrument annotations for validation
singing_blocks = [[i for i,value in it] for key,it in itertools.groupby(enumerate(singing_activities), key=operator.itemgetter(1)) if key != 0]

# Collect all rows first and build the frame once. DataFrame.append was removed
# in pandas 2.0, and appending row by row copied the whole table every time.
# The leading [0, 0] row of the previous export format is kept.
region_rows = [[0, 0]]
for frames in singing_blocks:
    start_frame = frames[0]
    end_frame = frames[-1]
    start_time = librosa.frames_to_time(start_frame, sr=Fs, hop_length=hop_length)
    # NOTE(review): end_frame is the last active frame, so end_time marks the
    # *start* of that frame -- confirm whether the region should extend one frame further.
    end_time = librosa.frames_to_time(end_frame, sr=Fs, hop_length=hop_length)
    region_rows.append([start_time, end_time])

# Every row carries index 0, exactly as the row-wise append produced, so the
# written CSV keeps its previous layout.
df = pd.DataFrame(region_rows, columns=['start_time', 'end_time'], index=[0] * len(region_rows))

csv_path = os.path.join('H:', os.sep, 'workspace','python','ISAD_wagner_vad','test_annot.csv')
df.to_csv(csv_path)
# Minutes during which each instrument family is active inside the
# false-positive / false-negative frames of the median-filtered result.
# (`1*...` turns the boolean error masks into integers for the dot product.)
fp_woodwind = np.dot(woodwind_activities, 1*false_pos_filtered)/(60*frame_rate)
fp_brass = np.dot(brass_activities, 1*false_pos_filtered)/(60*frame_rate)
fp_strings = np.dot(strings_activities, 1*false_pos_filtered)/(60*frame_rate)
fn_woodwind = np.dot(woodwind_activities, 1*false_neg_filtered)/(60*frame_rate)
fn_brass = np.dot(brass_activities, 1*false_neg_filtered)/(60*frame_rate)
fn_strings = np.dot(strings_activities, 1*false_neg_filtered)/(60*frame_rate)

fig = plt.figure(figsize=(10, 12), dpi=50)
gs = gridspec.GridSpec(3, 1)
ax0 = plt.subplot(gs[0, 0])
ax1 = plt.subplot(gs[1, 0])
ax2 = plt.subplot(gs[2, 0])

# Ticks are pinned to the bar positions before labelling. Labelling the
# automatically chosen ticks (with a leading '' as padding) only lines up when
# the locator happens to place a tick on every bar.
family_positions = np.arange(4)
family_labels = ['Woodwinds', 'Brass', 'Strings', 'Singing']

# Panel 1: total activity per family.
im = ax0.bar(family_positions, height=[woodwind_minutes, brass_minutes, strings_minutes, singing_minutes])
ax0.set_xticks(family_positions)
ax0.set_xticklabels(family_labels)
plt.rcParams.update({'font.size': 22})
ax0.set_ylabel('Activity (minutes)')
ax0.set_title('Total Instrument Activites')

# Panel 2: family activity during false positives (no value for singing, shown as 0).
im = ax1.bar(family_positions, height=[fp_woodwind, fp_brass, fp_strings, 0])
ax1.set_xticks(family_positions)
ax1.set_xticklabels(family_labels)
ax1.set_ylabel('Activity (minutes)')
ax1.set_title('False Positive Regions - Instrument Activites')

# Panel 3: family activity during false negatives (no value for singing, shown as 0).
im = ax2.bar(family_positions, height=[fn_woodwind, fn_brass, fn_strings, 0])
ax2.set_xticks(family_positions)
ax2.set_xticklabels(family_labels)
ax2.set_ylabel('Activity (minutes)')
ax2.set_title('False Negative Regions - Instrument Activites')

fig.subplots_adjust(hspace=0.5)
#plt.tight_layout()
plt.show()
import scipy as sp  # no longer needed by this cell; kept in case other cells use `sp`

# Split the frames by which combination of woodwinds, brass and strings is active.
# np.logical_not replaces the deprecated NumPy alias in the top-level scipy namespace.
woodwind_only = woodwind_activities*np.logical_not(brass_activities)*np.logical_not(strings_activities)
brass_only = brass_activities*np.logical_not(woodwind_activities)*np.logical_not(strings_activities)
strings_only = strings_activities*np.logical_not(brass_activities)*np.logical_not(woodwind_activities)
woodwind_brass = np.multiply(woodwind_activities, brass_activities)*np.logical_not(strings_activities)
woodwind_strings = np.multiply(woodwind_activities, strings_activities)*np.logical_not(brass_activities)
strings_brass = np.multiply(strings_activities, brass_activities)*np.logical_not(woodwind_activities)
all_families = np.multiply(np.multiply(strings_activities, brass_activities), woodwind_activities)

# total: minutes per combination
min_woodwind_only = np.sum(1*woodwind_only)/(60*frame_rate)
min_brass_only = np.sum(1*brass_only)/(60*frame_rate)
min_strings_only = np.sum(1*strings_only)/(60*frame_rate)
min_woodwind_brass = np.sum(1*woodwind_brass)/(60*frame_rate)
min_woodwind_strings = np.sum(1*woodwind_strings)/(60*frame_rate)
min_strings_brass = np.sum(1*strings_brass)/(60*frame_rate)
min_all_families = np.sum(1*all_families)/(60*frame_rate)

# false pos: minutes per combination inside false-positive frames
fp_woodwind_only = np.dot(1*woodwind_only, 1*false_pos_filtered)/(60*frame_rate)
fp_brass_only = np.dot(1*brass_only, 1*false_pos_filtered)/(60*frame_rate)
fp_strings_only = np.dot(1*strings_only, 1*false_pos_filtered)/(60*frame_rate)
# Uses the woodwinds-and-brass mask (the woodwinds-only mask was used here before).
fp_woodwind_brass = np.dot(1*woodwind_brass, 1*false_pos_filtered)/(60*frame_rate)
fp_woodwind_strings = np.dot(1*woodwind_strings, 1*false_pos_filtered)/(60*frame_rate)
fp_strings_brass = np.dot(1*strings_brass, 1*false_pos_filtered)/(60*frame_rate)
fp_all_families = np.dot(1*all_families, 1*false_pos_filtered)/(60*frame_rate)

# false neg: minutes per combination inside false-negative frames
fn_woodwind_only = np.dot(1*woodwind_only, 1*false_neg_filtered)/(60*frame_rate)
fn_brass_only = np.dot(1*brass_only, 1*false_neg_filtered)/(60*frame_rate)
fn_strings_only = np.dot(1*strings_only, 1*false_neg_filtered)/(60*frame_rate)
# Uses the woodwinds-and-brass mask (the woodwinds-only mask was used here before).
fn_woodwind_brass = np.dot(1*woodwind_brass, 1*false_neg_filtered)/(60*frame_rate)
fn_woodwind_strings = np.dot(1*woodwind_strings, 1*false_neg_filtered)/(60*frame_rate)
fn_strings_brass = np.dot(1*strings_brass, 1*false_neg_filtered)/(60*frame_rate)
fn_all_families = np.dot(1*all_families, 1*false_neg_filtered)/(60*frame_rate)

# check total duration of all combinations
print(np.sum([min_woodwind_only, min_brass_only, min_strings_only, min_woodwind_brass, min_woodwind_strings, min_strings_brass, min_all_families]))

fig = plt.figure(figsize=(25, 12), dpi=50)
gs = gridspec.GridSpec(3, 1)
ax0 = plt.subplot(gs[0, 0])
ax1 = plt.subplot(gs[1, 0])
ax2 = plt.subplot(gs[2, 0])

# Ticks are pinned to the bar positions before labelling, so every label sits
# under its bar whatever ticks the automatic locator would have chosen.
combo_positions = np.arange(7)
combo_labels = ['Wood only', 'Brass only', 'Strings only', 'Wood&Brass', 'Wood&Strings', 'Strings&Brass', 'All three']

# Panel 1: total minutes per combination.
im = ax0.bar(combo_positions, height=[min_woodwind_only, min_brass_only, min_strings_only, min_woodwind_brass, min_woodwind_strings, min_strings_brass, min_all_families])
ax0.set_xticks(combo_positions)
ax0.set_xticklabels(combo_labels)
plt.rcParams.update({'font.size': 22})
ax0.set_ylabel('Activity (minutes)')
ax0.set_title('Total Instrument Activites')

# Panel 2: minutes per combination inside false-positive frames.
im = ax1.bar(combo_positions, height=[fp_woodwind_only, fp_brass_only, fp_strings_only, fp_woodwind_brass, fp_woodwind_strings, fp_strings_brass, fp_all_families])
ax1.set_xticks(combo_positions)
ax1.set_xticklabels(combo_labels)
ax1.set_ylabel('Activity (minutes)')
ax1.set_title('False Positive Regions - Instrument Activites')

# Panel 3: minutes per combination inside false-negative frames.
im = ax2.bar(combo_positions, height=[fn_woodwind_only, fn_brass_only, fn_strings_only, fn_woodwind_brass, fn_woodwind_strings, fn_strings_brass, fn_all_families])
ax2.set_xticks(combo_positions)
ax2.set_xticklabels(combo_labels)
ax2.set_ylabel('Activity (minutes)')
ax2.set_title('False Negative Regions - Instrument Activites')

fig.subplots_adjust(hspace=0.5)
#plt.tight_layout()
plt.show()
# Frame-wise RMS energy of the recording on the 10 Hz frame grid.
# librosa renamed feature.rmse to feature.rms; resolve whichever name the
# installed version provides so the cell runs on old and new releases.
_frame_rms = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
volume_framewise = _frame_rms(y=f_audio, frame_length=Fs//frame_rate, hop_length=Fs//frame_rate).T
print(volume_framewise.shape)

plt.figure(figsize=(25, 5), dpi=50)
plt.plot(np.arange(volume_framewise.shape[0]), volume_framewise)
plt.show()

# RMS values of the frames in each category. The boolean masks have num_frames
# entries, so volume_framewise must have exactly that many rows.
volume_singing = volume_framewise[ground_truth_from_table==1]
volume_nosinging = volume_framewise[ground_truth_inverted_table==1]
volume_fp = volume_framewise[false_pos_filtered==1]
volume_fn = volume_framewise[false_neg_filtered==1]

# Category means: mask-weighted sum divided by the number of frames in the mask.
volume_average = np.mean(volume_framewise)
volume_singing_av = np.dot(volume_framewise.T, 1*ground_truth_from_table)/np.sum(1*ground_truth_from_table)
volume_nosinging_av = np.dot(volume_framewise.T, 1*ground_truth_inverted_table)/np.sum(1*ground_truth_inverted_table)
volume_fp_av = np.dot(volume_framewise.T, 1*false_pos_filtered)/np.sum(1*false_pos_filtered)
volume_fn_av = np.dot(volume_framewise.T, 1*false_neg_filtered)/np.sum(1*false_neg_filtered)
print('Average volume (all frames): %.3f' % volume_average)
print('Average volume (singing): %.3f' % volume_singing_av[0])
print('Average volume (no singing): %.3f' % volume_nosinging_av[0])
print('Average volume (false positives): %.3f' % volume_fp_av[0])
print('Average volume (false negatives): %.3f' % volume_fn_av[0])

# Histograms of the RMS values per category; the top-right grid cell stays empty.
fig = plt.figure(figsize=(25, 12), dpi=50)
gs = gridspec.GridSpec(3, 2, width_ratios=[1, 1])
ax0 = plt.subplot(gs[0, 0])
ax2 = plt.subplot(gs[1, 0])
ax3 = plt.subplot(gs[1, 1])
ax4 = plt.subplot(gs[2, 0])
ax5 = plt.subplot(gs[2, 1])

im = ax0.hist(volume_framewise, bins=np.arange(0, 0.15, 0.001))
ax0.set_title('RMSE - All frames')
im = ax2.hist(volume_singing, bins=np.arange(0, 0.15, 0.001))
ax2.set_title('RMSE - Singing')
im = ax3.hist(volume_nosinging, bins=np.arange(0, 0.15, 0.001))
ax3.set_title('RMSE - No Singing')
im = ax4.hist(volume_fp, bins=np.arange(0, 0.15, 0.001))
ax4.set_title('RMSE - False Positives')
im = ax5.hist(volume_fn, bins=np.arange(0, 0.15, 0.001))
ax5.set_title('RMSE - False Negatives')

fig.subplots_adjust(hspace=0.5)
#plt.tight_layout()
plt.show()